L’objectif de ce projet est de prédire le parti politique gagnant des élections
présidentielles de 2020 aux États-Unis à partir de données socio-démographiques.
# # Installer la bibliothèque xlrd pour lire les fichiers excel de type "xls"
# !pip install xlrd
# # Installer la bibliothèque shap pour réaliser l'analyse locale des variables
# !pip install shap
# Importation des bibliothèques nécessaires
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
# Charger les donnees
resultat_2020=pd.read_csv("data/2020_US_County_Level_Presidential_Results.csv")
historique_presidentielle=pd.read_csv("data/US_County_Level_Presidential_Results_08-16.csv")
education=pd.read_excel("data/Education.xls", sheet_name="Education 1970 to 2019")
population=pd.read_excel("data/PopulationEstimates.xls", sheet_name="Population Estimates 2010-19")
chomage=pd.read_excel("data/Unemployment.xls", sheet_name="Unemployment Med HH Income")
pauvrete=pd.read_excel("data/PovertyEstimates.xls", sheet_name="Poverty Data 2019")
# Affichez les dimensions du jeu de données resultat_2020
print(f"Le jeu de données a {resultat_2020.shape[0]} lignes et {resultat_2020.shape[1]} colonnes")
# Affichez les 5 premières lignes
resultat_2020.head(5)
Le jeu de données a 3152 lignes et 10 colonnes
| state_name | county_fips | county_name | votes_gop | votes_dem | total_votes | diff | per_gop | per_dem | per_point_diff | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Alabama | 1001 | Autauga County | 19838 | 7503 | 27770 | 12335 | 0.714368 | 0.270184 | 0.444184 |
| 1 | Alabama | 1003 | Baldwin County | 83544 | 24578 | 109679 | 58966 | 0.761714 | 0.224090 | 0.537623 |
| 2 | Alabama | 1005 | Barbour County | 5622 | 4816 | 10518 | 806 | 0.534512 | 0.457882 | 0.076631 |
| 3 | Alabama | 1007 | Bibb County | 7525 | 1986 | 9595 | 5539 | 0.784263 | 0.206983 | 0.577280 |
| 4 | Alabama | 1009 | Blount County | 24711 | 2640 | 27588 | 22071 | 0.895716 | 0.095694 | 0.800022 |
#afficher les info du dataset
resultat_2020.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3152 entries, 0 to 3151 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 state_name 3152 non-null object 1 county_fips 3152 non-null int64 2 county_name 3152 non-null object 3 votes_gop 3152 non-null int64 4 votes_dem 3152 non-null int64 5 total_votes 3152 non-null int64 6 diff 3152 non-null int64 7 per_gop 3152 non-null float64 8 per_dem 3152 non-null float64 9 per_point_diff 3152 non-null float64 dtypes: float64(3), int64(5), object(2) memory usage: 246.4+ KB
# verification valeur manquante
resultat_2020.isna().any()
state_name False county_fips False county_name False votes_gop False votes_dem False total_votes False diff False per_gop False per_dem False per_point_diff False dtype: bool
# verification de valeur dupliquée
resultat_2020.duplicated().any()
False
#fonction pour creer la variable cible en fonction du pourcentage de chaque partie
def recodage(x, y):
if x>y:
return 1
else:
return 0
0 Pour le Parti Démocrate
1 pour le Parti Républicain
#création de la variable cible
Target= pd.DataFrame(resultat_2020.apply(lambda row: recodage(row["votes_dem"], row["votes_gop"]), axis=1), columns=["Target"])
# ajout de la variable "country_fips"
Target["county_fips"]= resultat_2020["county_fips"]
# afficher le dataset
Target
| Target | county_fips | |
|---|---|---|
| 0 | 0 | 1001 |
| 1 | 0 | 1003 |
| 2 | 0 | 1005 |
| 3 | 0 | 1007 |
| 4 | 0 | 1009 |
| ... | ... | ... |
| 3147 | 0 | 56037 |
| 3148 | 1 | 56039 |
| 3149 | 0 | 56041 |
| 3150 | 0 | 56043 |
| 3151 | 0 | 56045 |
3152 rows × 2 columns
# Renommer la colonne 'colonne_initiale' en 'nouveau_nom_colonne'
education = education.rename(columns={'FIPS Code': 'county_fips'})
# Affichez les dimensions du jeu de données education
print(f"Le jeu de données a {education.shape[0]} lignes et {education.shape[1]} colonnes")
# Affichez les 5 premières lignes
education.head()
Le jeu de données a 3283 lignes et 47 colonnes
| county_fips | State | Area name | 2003 Rural-urban Continuum Code | 2003 Urban Influence Code | 2013 Rural-urban Continuum Code | 2013 Urban Influence Code | Less than a high school diploma, 1970 | High school diploma only, 1970 | Some college (1-3 years), 1970 | ... | Percent of adults completing some college or associate's degree, 2000 | Percent of adults with a bachelor's degree or higher, 2000 | Less than a high school diploma, 2015-19 | High school diploma only, 2015-19 | Some college or associate's degree, 2015-19 | Bachelor's degree or higher, 2015-19 | Percent of adults with less than a high school diploma, 2015-19 | Percent of adults with a high school diploma only, 2015-19 | Percent of adults completing some college or associate's degree, 2015-19 | Percent of adults with a bachelor's degree or higher, 2015-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | US | United States | NaN | NaN | NaN | NaN | 52373312.0 | 34158051.0 | 11650730.0 | ... | 27.4 | 24.4 | 26472261.0 | 59472748.0 | 63756905.0 | 70920162.0 | 11.998918 | 26.956844 | 28.898697 | 32.145542 |
| 1 | 1000 | AL | Alabama | NaN | NaN | NaN | NaN | 1062306.0 | 468269.0 | 136287.0 | ... | 25.9 | 19.0 | 458922.0 | 1022839.0 | 993344.0 | 845772.0 | 13.819302 | 30.800268 | 29.912098 | 25.468332 |
| 2 | 1001 | AL | Autauga County | 2.0 | 2.0 | 2.0 | 2.0 | 6611.0 | 3757.0 | 933.0 | ... | 26.9 | 18.0 | 4291.0 | 12551.0 | 10596.0 | 9929.0 | 11.483395 | 33.588459 | 28.356571 | 26.571573 |
| 3 | 1003 | AL | Baldwin County | 4.0 | 5.0 | 3.0 | 2.0 | 18726.0 | 8426.0 | 2334.0 | ... | 29.3 | 23.1 | 13893.0 | 41797.0 | 47274.0 | 48148.0 | 9.193843 | 27.659616 | 31.284081 | 31.862459 |
| 4 | 1005 | AL | Barbour County | 6.0 | 6.0 | 6.0 | 6.0 | 8120.0 | 2242.0 | 581.0 | ... | 21.3 | 10.9 | 4812.0 | 6396.0 | 4676.0 | 2080.0 | 26.786907 | 35.604542 | 26.029837 | 11.578713 |
5 rows × 47 columns
# Renommer la colonne 'colonne_initiale' en 'nouveau_nom_colonne'
population = population.rename(columns={'FIPStxt': 'county_fips'})
# Affichez les dimensions du jeu de données resultat_2020
print(f"Le jeu de données a {population.shape[0]} lignes et {population.shape[1]} colonnes")
# Affichez les 5 premières lignes
population.head(5)
Le jeu de données a 3273 lignes et 165 colonnes
| county_fips | State | Area_Name | Rural-urban_Continuum Code_2003 | Rural-urban_Continuum Code_2013 | Urban_Influence_Code_2003 | Urban_Influence_Code_2013 | Economic_typology_2015 | CENSUS_2010_POP | ESTIMATES_BASE_2010 | ... | R_DOMESTIC_MIG_2019 | R_NET_MIG_2011 | R_NET_MIG_2012 | R_NET_MIG_2013 | R_NET_MIG_2014 | R_NET_MIG_2015 | R_NET_MIG_2016 | R_NET_MIG_2017 | R_NET_MIG_2018 | R_NET_MIG_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | US | United States | NaN | NaN | NaN | NaN | NaN | 308745538 | 308758105 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1000 | AL | Alabama | NaN | NaN | NaN | NaN | NaN | 4779736 | 4780125 | ... | 1.917501 | 0.578434 | 1.186314 | 1.522549 | 0.563489 | 0.626357 | 0.745172 | 1.090366 | 1.773786 | 2.483744 |
| 2 | 1001 | AL | Autauga County | 2.0 | 2.0 | 2.0 | 2.0 | 0.0 | 54571 | 54597 | ... | 4.847310 | 6.018182 | -6.226119 | -3.902226 | 1.970443 | -1.712875 | 4.777171 | 0.849656 | 0.540916 | 4.560062 |
| 3 | 1003 | AL | Baldwin County | 4.0 | 3.0 | 5.0 | 2.0 | 5.0 | 182265 | 182265 | ... | 24.017829 | 16.641870 | 17.488579 | 22.751474 | 20.184334 | 17.725964 | 21.279291 | 22.398256 | 24.727215 | 24.380567 |
| 4 | 1005 | AL | Barbour County | 6.0 | 6.0 | 6.0 | 6.0 | 3.0 | 27457 | 27455 | ... | -5.690302 | 0.292676 | -6.897817 | -8.132185 | -5.140431 | -15.724575 | -18.238016 | -24.998528 | -8.754922 | -5.165664 |
5 rows × 165 columns
# Renommer la colonne 'colonne_initiale' en 'nouveau_nom_colonne'
chomage = chomage.rename(columns={'fips_txt': 'county_fips'})
# Affichez les dimensions du jeu de données
print(f"Le jeu de données a {chomage.shape[0]} lignes et {chomage.shape[1]} colonnes")
# Affichez les 5 premières lignes
chomage.head()
Le jeu de données a 3275 lignes et 88 colonnes
| county_fips | Stabr | area_name | Rural_urban_continuum_code_2013 | Urban_influence_code_2013 | Metro_2013 | Civilian_labor_force_2000 | Employed_2000 | Unemployed_2000 | Unemployment_rate_2000 | ... | Civilian_labor_force_2018 | Employed_2018 | Unemployed_2018 | Unemployment_rate_2018 | Civilian_labor_force_2019 | Employed_2019 | Unemployed_2019 | Unemployment_rate_2019 | Median_Household_Income_2019 | Med_HH_Income_Percent_of_State_Total_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | US | United States | NaN | NaN | NaN | 142601667.0 | 136904680.0 | 5696987.0 | 3.995035 | ... | 161389026.0 | 155102319.0 | 6286707.0 | 3.895375 | 163100055.0 | 157115247.0 | 5984808.0 | 3.669409 | 65712.0 | NaN |
| 1 | 1000 | AL | Alabama | NaN | NaN | NaN | 2133223.0 | 2035594.0 | 97629.0 | 4.600000 | ... | 2216627.0 | 2130845.0 | 85782.0 | 3.900000 | 2241747.0 | 2174483.0 | 67264.0 | 3.000000 | 51771.0 | 100.000000 |
| 2 | 1001 | AL | Autauga County, AL | 2.0 | 2.0 | 1.0 | 21720.0 | 20846.0 | 874.0 | 4.000000 | ... | 26196.0 | 25261.0 | 935.0 | 3.600000 | 26172.0 | 25458.0 | 714.0 | 2.700000 | 58233.0 | 112.481888 |
| 3 | 1003 | AL | Baldwin County, AL | 3.0 | 2.0 | 1.0 | 69533.0 | 66971.0 | 2562.0 | 3.700000 | ... | 95233.0 | 91809.0 | 3424.0 | 3.600000 | 97328.0 | 94675.0 | 2653.0 | 2.700000 | 59871.0 | 115.645828 |
| 4 | 1005 | AL | Barbour County, AL | 6.0 | 6.0 | 0.0 | 11373.0 | 10748.0 | 625.0 | 5.500000 | ... | 8414.0 | 7987.0 | 427.0 | 5.100000 | 8537.0 | 8213.0 | 324.0 | 3.800000 | 35972.0 | 69.482918 |
5 rows × 88 columns
# Renommer la colonne 'colonne_initiale' en 'nouveau_nom_colonne'
pauvrete = pauvrete.rename(columns={'FIPStxt': 'county_fips'})
# Affichez les dimensions du jeu de données resultat_2020
print(f"Le jeu de données a {pauvrete.shape[0]} lignes et {pauvrete.shape[1]} colonnes")
# Affichez les 5 premières lignes
pauvrete.head()
Le jeu de données a 3193 lignes et 34 colonnes
| county_fips | Stabr | Area_name | Rural-urban_Continuum_Code_2003 | Urban_Influence_Code_2003 | Rural-urban_Continuum_Code_2013 | Urban_Influence_Code_2013 | POVALL_2019 | CI90LBALL_2019 | CI90UBALL_2019 | ... | CI90UB517P_2019 | MEDHHINC_2019 | CI90LBINC_2019 | CI90UBINC_2019 | POV04_2019 | CI90LB04_2019 | CI90UB04_2019 | PCTPOV04_2019 | CI90LB04P_2019 | CI90UB04P_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | US | United States | NaN | NaN | NaN | NaN | 39490096 | 39248096 | 39732096 | ... | 16.0 | 65712 | 65594 | 65830 | 3457689.0 | 3405854.0 | 3509524.0 | 18.2 | 17.9 | 18.5 |
| 1 | 1000 | AL | Alabama | NaN | NaN | NaN | NaN | 747478 | 730491 | 764465 | ... | 21.6 | 51771 | 51179 | 52363 | 69236.0 | 65296.0 | 73176.0 | 24.2 | 22.8 | 25.6 |
| 2 | 1001 | AL | Autauga County | 2.0 | 2.0 | 2.0 | 2.0 | 6723 | 5517 | 7929 | ... | 19.4 | 58233 | 52517 | 63949 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 1003 | AL | Baldwin County | 4.0 | 5.0 | 3.0 | 2.0 | 22360 | 18541 | 26179 | ... | 17.2 | 59871 | 54593 | 65149 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1005 | AL | Barbour County | 6.0 | 6.0 | 6.0 | 6.0 | 5909 | 4787 | 7031 | ... | 49.0 | 35972 | 31822 | 40122 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 34 columns
# Concaténation des datasets en se basant sur la colonne 'county_fips'
data = pd.merge(education, population, on='county_fips')
data = pd.merge(data, chomage, on='county_fips')
data = pd.merge(data, pauvrete, on='county_fips')
data = pd.merge(data, Target, on='county_fips')
# Réinitialisation de l'index
data=data.reset_index(drop=True)
# Affichez les dimensions du jeu de données
print(f"Le jeu de données a {data.shape[0]} lignes et {data.shape[1]} colonnes")
# Affichez les 5 premières lignes
data.head()
Le jeu de données a 3112 lignes et 332 colonnes
| county_fips | State_x | Area name | 2003 Rural-urban Continuum Code | 2003 Urban Influence Code | 2013 Rural-urban Continuum Code | 2013 Urban Influence Code | Less than a high school diploma, 1970 | High school diploma only, 1970 | Some college (1-3 years), 1970 | ... | MEDHHINC_2019 | CI90LBINC_2019 | CI90UBINC_2019 | POV04_2019 | CI90LB04_2019 | CI90UB04_2019 | PCTPOV04_2019 | CI90LB04P_2019 | CI90UB04P_2019 | Target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | AL | Autauga County | 2.0 | 2.0 | 2.0 | 2.0 | 6611.0 | 3757.0 | 933.0 | ... | 58233 | 52517 | 63949 | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 1 | 1003 | AL | Baldwin County | 4.0 | 5.0 | 3.0 | 2.0 | 18726.0 | 8426.0 | 2334.0 | ... | 59871 | 54593 | 65149 | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 2 | 1005 | AL | Barbour County | 6.0 | 6.0 | 6.0 | 6.0 | 8120.0 | 2242.0 | 581.0 | ... | 35972 | 31822 | 40122 | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 3 | 1007 | AL | Bibb County | 1.0 | 1.0 | 1.0 | 1.0 | 5272.0 | 1402.0 | 238.0 | ... | 47918 | 42291 | 53545 | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 4 | 1009 | AL | Blount County | 1.0 | 1.0 | 1.0 | 1.0 | 10677.0 | 3440.0 | 626.0 | ... | 52902 | 46777 | 59027 | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
5 rows × 332 columns
#afficher les info du dataset
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3112 entries, 0 to 3111 Columns: 332 entries, county_fips to Target dtypes: float64(298), int64(26), object(8) memory usage: 7.9+ MB
# voir le pourcentage de valeur manquante pour chaque variable
data.isna().sum()/len(data)*100
county_fips 0.0
State_x 0.0
Area name 0.0
2003 Rural-urban Continuum Code 0.0
2003 Urban Influence Code 0.0
...
CI90UB04_2019 100.0
PCTPOV04_2019 100.0
CI90LB04P_2019 100.0
CI90UB04P_2019 100.0
Target 0.0
Length: 332, dtype: float64
# fonction pour obtenir la liste des variables ayant beaucoup de valeurs manquantes
def variables_manquantes(dataset, seuil):
"""
Cette fonction parcourt un dataset et renvoie une liste des variables
ayant un pourcentage de valeurs manquantes supérieur au seuil spécifié.
Args:
- dataset (DataFrame): Le dataset à analyser.
- seuil (float): Le seuil de pourcentage de valeurs manquantes à partir duquel
une variable est considérée comme ayant beaucoup de valeurs manquantes.
Returns:
- list: Une liste des noms de variables ayant beaucoup de valeurs manquantes.
"""
variables_manquantes = []
total_lignes = len(dataset)
for colonne in dataset.columns:
nb_valeurs_manquantes = dataset[colonne].isnull().sum()
pourcentage_manquant = (nb_valeurs_manquantes / total_lignes) * 100
if pourcentage_manquant > seuil:
variables_manquantes.append(colonne)
return variables_manquantes
# Appeler la fonction pour obtenir la liste des variables ayant beaucoup de valeurs manquantes
variables_manquantes_liste = variables_manquantes(data, 50)
# Afficher la liste des variables ayant beaucoup de valeurs manquantes
print("Variables avec beaucoup de valeurs manquantes : ", variables_manquantes_liste)
Variables avec beaucoup de valeurs manquantes : ['POV04_2019', 'CI90LB04_2019', 'CI90UB04_2019', 'PCTPOV04_2019', 'CI90LB04P_2019', 'CI90UB04P_2019']
#suppression des colonnes ayant trop de valeur manquante
data.drop(variables_manquantes_liste,axis=1, inplace=True)
print(f"Le jeu de données a désormais {data.shape[0]} lignes et {data.shape[1]} colonnes")
Le jeu de données a désormais 3112 lignes et 326 colonnes
# Sélectionner uniquement les colonnes numériques
numeric_columns = data.select_dtypes(include=np.number)
# Initialisation de l'imputeur pour les valeurs numériques
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
# Imputation sur les valeurs numériques
data[numeric_columns.columns] = imp_mean.fit_transform(numeric_columns)
#verification de valeur dupliquée
data.duplicated().any()
False
#création d'un mask pour selectionner les colonne numerique de mon dataset
mask = data.select_dtypes(exclude="object")
# Afficher un boxplot des variables pour detecter visuellement la presence d'un outlier
plt.figure(figsize=(30, 80))
sns.boxplot(data=mask, orient='h')
plt.title(' boxplot des variables')
plt.show()
# fontion pour voir les outliers dans un DataFrame
def show_possible_outliers(df, column):
"""
Détecte les outliers dans une colonne numérique d'un DataFrame selon la méthode de l'IQR.
Inputs :
--------
- df : DataFrame
- column : Nom de la colonne numérique (str)
Outputs :
---------
Renvoie la liste des index des possibles outliers ainsi que les limite basse et haute
"""
# 1er Quartile
Q1 = np.nanpercentile(df[column], 25)
# 3ème Quartile
Q3 = np.nanpercentile(df[column], 75)
# Inter-Quartile Range (IQR)
IQR = Q3 - Q1
# limites, basse & haute
outer_fence = IQR * 1.5
outer_lower = Q1 - outer_fence
outer_upper = Q3 + outer_fence
# Détection des outliers potentiels
mask = (df[column] < outer_lower) | (df[column] > outer_upper)
# Stockage de leurs indices
possible_outlier_index = df[mask].index
# Passage sous forme de list
outliers_index = possible_outlier_index.tolist()
return sorted(outliers_index), outer_lower, outer_upper
# Appel de la fonction pour détecter les outliers dans une colonne specifique
outliers_index, outer_lower, outer_upper = show_possible_outliers(data, "Less than a high school diploma, 1970")
# Définir l'option pour afficher toutes les colonnes
pd.set_option('display.max_columns', None)
# Afficher le DataFrame avec avec les index des valeurs aberrantes
print(f"Le DataFrame avec les valeurs aberrantes d'une colonne specifique :")
data.loc[outliers_index].head()
Le DataFrame avec les valeurs aberrantes d'une colonne specifique :
| county_fips | State_x | Area name | 2003 Rural-urban Continuum Code | 2003 Urban Influence Code | 2013 Rural-urban Continuum Code | 2013 Urban Influence Code | Less than a high school diploma, 1970 | High school diploma only, 1970 | Some college (1-3 years), 1970 | Four years of college or higher, 1970 | Percent of adults with less than a high school diploma, 1970 | Percent of adults with a high school diploma only, 1970 | Percent of adults completing some college (1-3 years), 1970 | Percent of adults completing four years of college or higher, 1970 | Less than a high school diploma, 1980 | High school diploma only, 1980 | Some college (1-3 years), 1980 | Four years of college or higher, 1980 | Percent of adults with less than a high school diploma, 1980 | Percent of adults with a high school diploma only, 1980 | Percent of adults completing some college (1-3 years), 1980 | Percent of adults completing four years of college or higher, 1980 | Less than a high school diploma, 1990 | High school diploma only, 1990 | Some college or associate's degree, 1990 | Bachelor's degree or higher, 1990 | Percent of adults with less than a high school diploma, 1990 | Percent of adults with a high school diploma only, 1990 | Percent of adults completing some college or associate's degree, 1990 | Percent of adults with a bachelor's degree or higher, 1990 | Less than a high school diploma, 2000 | High school diploma only, 2000 | Some college or associate's degree, 2000 | Bachelor's degree or higher, 2000 | Percent of adults with less than a high school diploma, 2000 | Percent of adults with a high school diploma only, 2000 | Percent of adults completing some college or associate's degree, 2000 | Percent of adults with a bachelor's degree or higher, 2000 | Less than a high school diploma, 2015-19 | High school diploma only, 2015-19 | Some college or associate's degree, 2015-19 | Bachelor's degree or higher, 2015-19 | Percent of adults with less than a high school diploma, 2015-19 | Percent of adults with a high school diploma only, 2015-19 | Percent of adults completing some college or associate's degree, 2015-19 | Percent of adults with a bachelor's degree or higher, 2015-19 | State_y | Area_Name | Rural-urban_Continuum Code_2003 | Rural-urban_Continuum Code_2013 | Urban_Influence_Code_2003_x | Urban_Influence_Code_2013_x | Economic_typology_2015 | CENSUS_2010_POP | ESTIMATES_BASE_2010 | POP_ESTIMATE_2010 | POP_ESTIMATE_2011 | POP_ESTIMATE_2012 | POP_ESTIMATE_2013 | POP_ESTIMATE_2014 | POP_ESTIMATE_2015 | POP_ESTIMATE_2016 | POP_ESTIMATE_2017 | POP_ESTIMATE_2018 | POP_ESTIMATE_2019 | N_POP_CHG_2010 | N_POP_CHG_2011 | N_POP_CHG_2012 | N_POP_CHG_2013 | N_POP_CHG_2014 | N_POP_CHG_2015 | N_POP_CHG_2016 | N_POP_CHG_2017 | N_POP_CHG_2018 | N_POP_CHG_2019 | Births_2010 | Births_2011 | Births_2012 | Births_2013 | Births_2014 | Births_2015 | Births_2016 | Births_2017 | Births_2018 | Births_2019 | Deaths_2010 | Deaths_2011 | Deaths_2012 | Deaths_2013 | Deaths_2014 | Deaths_2015 | Deaths_2016 | Deaths_2017 | Deaths_2018 | Deaths_2019 | NATURAL_INC_2010 | NATURAL_INC_2011 | NATURAL_INC_2012 | NATURAL_INC_2013 | NATURAL_INC_2014 | NATURAL_INC_2015 | NATURAL_INC_2016 | NATURAL_INC_2017 | NATURAL_INC_2018 | NATURAL_INC_2019 | INTERNATIONAL_MIG_2010 | INTERNATIONAL_MIG_2011 | INTERNATIONAL_MIG_2012 | INTERNATIONAL_MIG_2013 | INTERNATIONAL_MIG_2014 | INTERNATIONAL_MIG_2015 | INTERNATIONAL_MIG_2016 | INTERNATIONAL_MIG_2017 | INTERNATIONAL_MIG_2018 | INTERNATIONAL_MIG_2019 | DOMESTIC_MIG_2010 | DOMESTIC_MIG_2011 | DOMESTIC_MIG_2012 | DOMESTIC_MIG_2013 | DOMESTIC_MIG_2014 | DOMESTIC_MIG_2015 | DOMESTIC_MIG_2016 | DOMESTIC_MIG_2017 | DOMESTIC_MIG_2018 | DOMESTIC_MIG_2019 | NET_MIG_2010 | NET_MIG_2011 | NET_MIG_2012 | NET_MIG_2013 | NET_MIG_2014 | NET_MIG_2015 | NET_MIG_2016 | NET_MIG_2017 | NET_MIG_2018 | NET_MIG_2019 | RESIDUAL_2010 | RESIDUAL_2011 | RESIDUAL_2012 | RESIDUAL_2013 | RESIDUAL_2014 | RESIDUAL_2015 | RESIDUAL_2016 | RESIDUAL_2017 | RESIDUAL_2018 | RESIDUAL_2019 | GQ_ESTIMATES_BASE_2010 | GQ_ESTIMATES_2010 | GQ_ESTIMATES_2011 | GQ_ESTIMATES_2012 | GQ_ESTIMATES_2013 | GQ_ESTIMATES_2014 | GQ_ESTIMATES_2015 | GQ_ESTIMATES_2016 | GQ_ESTIMATES_2017 | GQ_ESTIMATES_2018 | GQ_ESTIMATES_2019 | R_birth_2011 | R_birth_2012 | R_birth_2013 | R_birth_2014 | R_birth_2015 | R_birth_2016 | R_birth_2017 | R_birth_2018 | R_birth_2019 | R_death_2011 | R_death_2012 | R_death_2013 | R_death_2014 | R_death_2015 | R_death_2016 | R_death_2017 | R_death_2018 | R_death_2019 | R_NATURAL_INC_2011 | R_NATURAL_INC_2012 | R_NATURAL_INC_2013 | R_NATURAL_INC_2014 | R_NATURAL_INC_2015 | R_NATURAL_INC_2016 | R_NATURAL_INC_2017 | R_NATURAL_INC_2018 | R_NATURAL_INC_2019 | R_INTERNATIONAL_MIG_2011 | R_INTERNATIONAL_MIG_2012 | R_INTERNATIONAL_MIG_2013 | R_INTERNATIONAL_MIG_2014 | R_INTERNATIONAL_MIG_2015 | R_INTERNATIONAL_MIG_2016 | R_INTERNATIONAL_MIG_2017 | R_INTERNATIONAL_MIG_2018 | R_INTERNATIONAL_MIG_2019 | R_DOMESTIC_MIG_2011 | R_DOMESTIC_MIG_2012 | R_DOMESTIC_MIG_2013 | R_DOMESTIC_MIG_2014 | R_DOMESTIC_MIG_2015 | R_DOMESTIC_MIG_2016 | R_DOMESTIC_MIG_2017 | R_DOMESTIC_MIG_2018 | R_DOMESTIC_MIG_2019 | R_NET_MIG_2011 | R_NET_MIG_2012 | R_NET_MIG_2013 | R_NET_MIG_2014 | R_NET_MIG_2015 | R_NET_MIG_2016 | R_NET_MIG_2017 | R_NET_MIG_2018 | R_NET_MIG_2019 | Stabr_x | area_name | Rural_urban_continuum_code_2013 | Urban_influence_code_2013 | Metro_2013 | Civilian_labor_force_2000 | Employed_2000 | Unemployed_2000 | Unemployment_rate_2000 | Civilian_labor_force_2001 | Employed_2001 | Unemployed_2001 | Unemployment_rate_2001 | Civilian_labor_force_2002 | Employed_2002 | Unemployed_2002 | Unemployment_rate_2002 | Civilian_labor_force_2003 | Employed_2003 | Unemployed_2003 | Unemployment_rate_2003 | Civilian_labor_force_2004 | Employed_2004 | Unemployed_2004 | Unemployment_rate_2004 | Civilian_labor_force_2005 | Employed_2005 | Unemployed_2005 | Unemployment_rate_2005 | Civilian_labor_force_2006 | Employed_2006 | Unemployed_2006 | Unemployment_rate_2006 | Civilian_labor_force_2007 | Employed_2007 | Unemployed_2007 | Unemployment_rate_2007 | Civilian_labor_force_2008 | Employed_2008 | Unemployed_2008 | Unemployment_rate_2008 | Civilian_labor_force_2009 | Employed_2009 | Unemployed_2009 | Unemployment_rate_2009 | Civilian_labor_force_2010 | Employed_2010 | Unemployed_2010 | Unemployment_rate_2010 | Civilian_labor_force_2011 | Employed_2011 | Unemployed_2011 | Unemployment_rate_2011 | Civilian_labor_force_2012 | Employed_2012 | Unemployed_2012 | Unemployment_rate_2012 | Civilian_labor_force_2013 | Employed_2013 | Unemployed_2013 | Unemployment_rate_2013 | Civilian_labor_force_2014 | Employed_2014 | Unemployed_2014 | Unemployment_rate_2014 | Civilian_labor_force_2015 | Employed_2015 | Unemployed_2015 | Unemployment_rate_2015 | Civilian_labor_force_2016 | Employed_2016 | Unemployed_2016 | Unemployment_rate_2016 | Civilian_labor_force_2017 | Employed_2017 | Unemployed_2017 | Unemployment_rate_2017 | Civilian_labor_force_2018 | Employed_2018 | Unemployed_2018 | Unemployment_rate_2018 | Civilian_labor_force_2019 | Employed_2019 | Unemployed_2019 | Unemployment_rate_2019 | Median_Household_Income_2019 | Med_HH_Income_Percent_of_State_Total_2019 | Stabr_y | Area_name | Rural-urban_Continuum_Code_2003 | Urban_Influence_Code_2003_y | Rural-urban_Continuum_Code_2013 | Urban_Influence_Code_2013_y | POVALL_2019 | CI90LBALL_2019 | CI90UBALL_2019 | PCTPOVALL_2019 | CI90LBALLP_2019 | CI90UBALLP_2019 | POV017_2019 | CI90LB017_2019 | CI90UB017_2019 | PCTPOV017_2019 | CI90LB017P_2019 | CI90UB017P_2019 | POV517_2019 | CI90LB517_2019 | CI90UB517_2019 | PCTPOV517_2019 | CI90LB517P_2019 | CI90UB517P_2019 | MEDHHINC_2019 | CI90LBINC_2019 | CI90UBINC_2019 | Target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 1015.0 | AL | Calhoun County | 3.0 | 2.0 | 3.0 | 2.0 | 30535.0 | 13804.0 | 3823.0 | 3921.0 | 58.6 | 26.5 | 7.3 | 7.5 | 28268.0 | 21411.0 | 8449.0 | 7213.0 | 43.3 | 32.8 | 12.9 | 11.0 | 23633.0 | 21711.0 | 16835.0 | 10266.0 | 32.6 | 30.0 | 23.2 | 14.2 | 19318.0 | 23856.0 | 19576.0 | 11265.0 | 26.1 | 32.2 | 26.4 | 15.2 | 12315.0 | 25940.0 | 26233.0 | 14620.0 | 15.567326 | 32.790615 | 33.160995 | 18.481064 | AL | Calhoun County | 3.0 | 3.0 | 2.0 | 2.0 | 4.0 | 118572.0 | 118526.0 | 118408.0 | 117744.0 | 117190.0 | 116471.0 | 115917.0 | 115469.0 | 114973.0 | 114710.0 | 114331.0 | 113605.0 | -118.0 | -664.0 | -554.0 | -719.0 | -554.0 | -448.0 | -496.0 | -263.0 | -379.0 | -726.0 | 318.0 | 1385.0 | 1356.0 | 1309.0 | 1315.0 | 1388.0 | 1382.0 | 1324.0 | 1299.0 | 1269.0 | 311.0 | 1325.0 | 1359.0 | 1410.0 | 1395.0 | 1455.0 | 1475.0 | 1393.0 | 1616.0 | 1532.0 | 7.0 | 60.0 | -3.0 | -101.0 | -80.0 | -67.0 | -93.0 | -69.0 | -317.0 | -263.0 | -4.0 | 26.0 | 67.0 | 45.0 | 66.0 | 66.0 | 102.0 | 69.0 | 103.0 | 14.0 | -113.0 | -752.0 | -606.0 | -659.0 | -534.0 | -438.0 | -502.0 | -259.0 | -159.0 | -475.0 | -117.0 | -726.0 | -539.0 | -614.0 | -468.0 | -372.0 | -400.0 | -190.0 | -56.0 | -461.0 | -8.0 | 2.0 | -12.0 | -4.0 | -6.0 | -9.0 | -3.0 | -4.0 | -6.0 | -2.0 | 2933.0 | 2933.0 | 2882.0 | 2958.0 | 2814.0 | 2798.0 | 2775.0 | 2761.0 | 2743.0 | 2830.0 | 2833.0 | 11.729733 | 11.543668 | 11.204266 | 11.317280 | 11.997269 | 11.994341 | 11.528933 | 11.342947 | 11.134704 | 11.221586 | 11.569207 | 12.068766 | 12.005783 | 12.576388 | 12.801486 | 12.129761 | 14.111011 | 13.442370 | 0.508147 | -0.025539 | -0.864500 | -0.688504 | -0.579119 | -0.807145 | -0.600828 | -2.768063 | -2.307665 | 0.220197 | 0.570373 | 0.385173 | 0.568016 | 0.570475 | 0.885255 | 0.600828 | 0.899402 | 0.122841 | -6.368779 | -5.158896 | -5.640650 | -4.595762 | -3.785882 | -4.356845 | -2.255282 | -1.388398 | -4.167837 | -6.148582 | -4.588523 | -5.255477 | -4.027747 | -3.215406 | -3.471589 | -1.654454 | -0.488995 | -4.044995 | AL | Calhoun County, AL | 3.0 | 2.0 | 1.0 | 52896.0 | 50223.0 | 2673.0 | 5.1 | 51578.0 | 48744.0 | 2834.0 | 5.5 | 51952.0 | 48851.0 | 3101.0 | 6.0 | 52817.0 | 49673.0 | 3144.0 | 6.0 | 54026.0 | 51065.0 | 2961.0 | 5.5 | 53955.0 | 51524.0 | 2431.0 | 4.5 | 54063.0 | 51872.0 | 2191.0 | 4.1 | 54861.0 | 52709.0 | 2152.0 | 3.9 | 54564.0 | 51468.0 | 3096.0 | 5.7 | 53925.0 | 47929.0 | 5996.0 | 11.1 | 51559.0 | 45699.0 | 5860.0 | 11.4 | 51268.0 | 45963.0 | 5305.0 | 10.3 | 49649.0 | 45252.0 | 4397.0 | 8.9 | 48405.0 | 44163.0 | 4242.0 | 8.8 | 47237.0 | 43467.0 | 3770.0 | 8.0 | 45995.0 | 42764.0 | 3231.0 | 7.0 | 46013.0 | 43016.0 | 2997.0 | 6.5 | 45950.0 | 43679.0 | 2271.0 | 4.9 | 46307.0 | 44191.0 | 2116.0 | 4.6 | 46209.0 | 44574.0 | 1635.0 | 3.5 | 47747.0 | 92.227310 | AL | Calhoun County | 3.0 | 2.0 | 3.0 | 2.0 | 18988.0 | 15924.0 | 22052.0 | 17.2 | 14.4 | 20.0 | 5878.0 | 4553.0 | 7203.0 | 24.6 | 19.1 | 30.1 | 3994.0 | 2992.0 | 4996.0 | 22.8 | 17.1 | 28.5 | 47747.0 | 44016.0 | 51478.0 | 0.0 |
| 27 | 1055.0 | AL | Etowah County | 3.0 | 2.0 | 3.0 | 2.0 | 31249.0 | 14834.0 | 3927.0 | 2765.0 | 59.2 | 28.1 | 7.4 | 5.2 | 27842.0 | 20344.0 | 8053.0 | 5491.0 | 45.1 | 33.0 | 13.0 | 8.9 | 23588.0 | 20194.0 | 15201.0 | 6689.0 | 35.9 | 30.7 | 23.1 | 10.2 | 18115.0 | 22531.0 | 19811.0 | 9372.0 | 25.9 | 32.3 | 28.4 | 13.4 | 11127.0 | 23668.0 | 24422.0 | 12815.0 | 15.447301 | 32.857620 | 33.904377 | 17.790705 | AL | Etowah County | 3.0 | 3.0 | 2.0 | 2.0 | 0.0 | 104430.0 | 104429.0 | 104459.0 | 104369.0 | 104278.0 | 103884.0 | 103399.0 | 102998.0 | 102855.0 | 103007.0 | 102611.0 | 102268.0 | 30.0 | -90.0 | -91.0 | -394.0 | -485.0 | -401.0 | -143.0 | 152.0 | -396.0 | -343.0 | 260.0 | 1251.0 | 1156.0 | 1146.0 | 1204.0 | 1155.0 | 1214.0 | 1230.0 | 1188.0 | 1175.0 | 287.0 | 1355.0 | 1408.0 | 1347.0 | 1386.0 | 1380.0 | 1407.0 | 1379.0 | 1464.0 | 1487.0 | -27.0 | -104.0 | -252.0 | -201.0 | -182.0 | -225.0 | -193.0 | -149.0 | -276.0 | -312.0 | 10.0 | 33.0 | 18.0 | -7.0 | 15.0 | 38.0 | 63.0 | 29.0 | 37.0 | 37.0 | 60.0 | -15.0 | 161.0 | -169.0 | -305.0 | -204.0 | -5.0 | 278.0 | -154.0 | -62.0 | 70.0 | 18.0 | 179.0 | -176.0 | -290.0 | -166.0 | 58.0 | 307.0 | -117.0 | -25.0 | -13.0 | -4.0 | -18.0 | -17.0 | -13.0 | -10.0 | -8.0 | -6.0 | -3.0 | -6.0 | 2085.0 | 2085.0 | 2085.0 | 2085.0 | 2084.0 | 2085.0 | 2085.0 | 2085.0 | 2085.0 | 2086.0 | 2085.0 | 11.981152 | 11.080917 | 11.010655 | 11.616968 | 11.192023 | 11.794824 | 11.949753 | 11.555409 | 11.470185 | 12.977187 | 13.496480 | 12.941843 | 13.373021 | 13.372287 | 13.669949 | 13.397324 | 14.239998 | 14.515885 | -0.996035 | -2.415563 | -1.931188 | -1.756053 | -2.180264 | -1.875124 | -1.447572 | -2.684590 | -3.045700 | 0.316050 | 0.172540 | -0.067255 | 0.144730 | 0.368222 | 0.612087 | 0.281742 | 0.359891 | 0.361189 | -0.143659 | 1.543276 | -1.623735 | -2.942837 | -1.976773 | -0.048578 | 2.700838 | -1.497923 | -0.605235 | 0.172391 | 1.715817 | -1.690991 | -2.798107 | -1.608551 | 0.563509 | 2.982581 | -1.138033 | -0.244046 | AL | Etowah County, AL | 3.0 | 2.0 | 1.0 | 48177.0 | 45380.0 | 2797.0 | 5.8 | 46764.0 | 43689.0 | 3075.0 | 6.6 | 46892.0 | 43899.0 | 2993.0 | 6.4 | 46388.0 | 43351.0 | 3037.0 | 6.5 | 47170.0 | 44143.0 | 3027.0 | 6.4 | 46762.0 | 44542.0 | 2220.0 | 4.7 | 46521.0 | 44546.0 | 1975.0 | 4.2 | 45459.0 | 43431.0 | 2028.0 | 4.5 | 45727.0 | 42995.0 | 2732.0 | 6.0 | 44993.0 | 39838.0 | 5155.0 | 11.5 | 44000.0 | 39193.0 | 4807.0 | 10.9 | 44722.0 | 40238.0 | 4484.0 | 10.0 | 44232.0 | 40589.0 | 3643.0 | 8.2 | 43862.0 | 40538.0 | 3324.0 | 7.6 | 43719.0 | 40639.0 | 3080.0 | 7.0 | 42767.0 | 40126.0 | 2641.0 | 6.2 | 43451.0 | 40892.0 | 2559.0 | 5.9 | 43233.0 | 41251.0 | 1982.0 | 4.6 | 43252.0 | 41502.0 | 1750.0 | 4.0 | 42723.0 | 41295.0 | 1428.0 | 3.3 | 43047.0 | 83.148865 | AL | Etowah County | 3.0 | 2.0 | 3.0 | 2.0 | 18801.0 | 16286.0 | 21316.0 | 18.8 | 16.3 | 21.3 | 5754.0 | 4606.0 | 6902.0 | 27.0 | 21.6 | 32.4 | 3945.0 | 2988.0 | 4902.0 | 25.6 | 19.4 | 31.8 | 43047.0 | 39460.0 | 46634.0 | 0.0 |
| 36 | 1073.0 | AL | Jefferson County | 1.0 | 1.0 | 1.0 | 1.0 | 186882.0 | 101656.0 | 32182.0 | 33827.0 | 52.7 | 28.7 | 9.1 | 9.5 | 141531.0 | 132921.0 | 63342.0 | 63229.0 | 35.3 | 33.1 | 15.8 | 15.8 | 111325.0 | 120393.0 | 109272.0 | 84718.0 | 26.2 | 28.3 | 25.7 | 19.9 | 82950.0 | 121233.0 | 123142.0 | 106833.0 | 19.1 | 27.9 | 28.4 | 24.6 | 44684.0 | 118636.0 | 136299.0 | 150124.0 | 9.935452 | 26.378620 | 30.305975 | 33.379951 | AL | Jefferson County | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 658466.0 | 658567.0 | 658215.0 | 658109.0 | 658061.0 | 659265.0 | 659972.0 | 660455.0 | 660343.0 | 659599.0 | 659429.0 | 658573.0 | -352.0 | -106.0 | -48.0 | 1204.0 | 707.0 | 483.0 | -112.0 | -744.0 | -170.0 | -856.0 | 2105.0 | 8831.0 | 9008.0 | 8838.0 | 8657.0 | 8994.0 | 8729.0 | 8531.0 | 8662.0 | 8422.0 | 1585.0 | 6895.0 | 6738.0 | 7057.0 | 6854.0 | 7212.0 | 7016.0 | 7201.0 | 7436.0 | 7343.0 | 520.0 | 1936.0 | 2270.0 | 1781.0 | 1803.0 | 1782.0 | 1713.0 | 1330.0 | 1226.0 | 1079.0 | 98.0 | 774.0 | 671.0 | 760.0 | 520.0 | 668.0 | 843.0 | 399.0 | 453.0 | 460.0 | -965.0 | -2812.0 | -2920.0 | -1226.0 | -1488.0 | -1896.0 | -2639.0 | -2441.0 | -1827.0 | -2375.0 | -867.0 | -2038.0 | -2249.0 | -466.0 | -968.0 | -1228.0 | -1796.0 | -2042.0 | -1374.0 | -1915.0 | -5.0 | -4.0 | -69.0 | -111.0 | -128.0 | -71.0 | -29.0 | -32.0 | -22.0 | -20.0 | 15774.0 | 15775.0 | 15753.0 | 15805.0 | 16009.0 | 16023.0 | 16360.0 | 16612.0 | 16696.0 | 16651.0 | 16663.0 | 13.417669 | 13.688201 | 13.418091 | 13.124253 | 13.622866 | 13.217767 | 12.926326 | 13.133914 | 12.779950 | 10.476144 | 10.238799 | 10.714128 | 10.390855 | 10.923739 | 10.623880 | 10.911085 | 11.274969 | 11.142623 | 2.941525 | 3.449402 | 2.703962 | 2.733398 | 2.699127 | 2.593886 | 2.015240 | 1.858945 | 1.637327 | 1.176002 | 1.019625 | 1.153853 | 0.788334 | 1.011794 | 1.276501 | 0.604572 | 0.686869 | 0.698026 | -4.272504 | -4.437117 | -1.861346 | -2.255849 | -2.871798 | -3.996069 | -3.698647 | -2.770222 | -3.603940 | -3.096502 | -3.417492 | -0.707494 | -1.467515 | -1.860004 | -2.719568 | -3.094075 | -2.083352 | -2.905914 | AL | Jefferson County, AL | 1.0 | 1.0 | 1.0 | 329015.0 | 315985.0 | 13030.0 | 4.0 | 325883.0 | 311802.0 | 14081.0 | 4.3 | 322122.0 | 305378.0 | 16744.0 | 5.2 | 321042.0 | 303282.0 | 17760.0 | 5.5 | 320336.0 | 303284.0 | 17052.0 | 5.3 | 317359.0 | 303260.0 | 14099.0 | 4.4 | 316750.0 | 304027.0 | 12723.0 | 4.0 | 314739.0 | 302603.0 | 12136.0 | 3.9 | 311615.0 | 294792.0 | 16823.0 | 5.4 | 307018.0 | 274011.0 | 33007.0 | 10.8 | 318520.0 | 285627.0 | 32893.0 | 10.3 | 319652.0 | 289916.0 | 29736.0 | 9.3 | 316511.0 | 293020.0 | 23491.0 | 7.4 | 315405.0 | 294426.0 | 20979.0 | 6.7 | 312131.0 | 292505.0 | 19626.0 | 6.3 | 311481.0 | 293544.0 | 17937.0 | 5.8 | 312478.0 | 294630.0 | 17848.0 | 5.7 | 311481.0 | 298227.0 | 13254.0 | 4.3 | 315793.0 | 304248.0 | 11545.0 | 3.7 | 318755.0 | 309664.0 | 9091.0 | 2.9 | 54127.0 | 104.550812 | AL | Jefferson County | 1.0 | 1.0 | 1.0 | 1.0 | 104288.0 | 95487.0 | 113089.0 | 16.2 | 14.8 | 17.6 | 33205.0 | 28713.0 | 37697.0 | 22.5 | 19.4 | 25.6 | 22124.0 | 18668.0 | 25580.0 | 20.9 | 17.6 | 24.2 | 54127.0 | 51853.0 | 56401.0 | 1.0 |
| 44 | 1089.0 | AL | Madison County | 2.0 | 2.0 | 2.0 | 2.0 | 34090.0 | 28255.0 | 12537.0 | 17289.0 | 37.0 | 30.7 | 13.6 | 18.8 | 32899.0 | 34718.0 | 18930.0 | 23506.0 | 29.9 | 31.5 | 17.2 | 21.4 | 30211.0 | 35243.0 | 41361.0 | 46049.0 | 19.8 | 23.1 | 27.1 | 30.1 | 26308.0 | 39591.0 | 52676.0 | 61814.0 | 14.6 | 21.9 | 29.2 | 34.3 | 20562.0 | 51126.0 | 70137.0 | 105935.0 | 8.299160 | 20.635292 | 28.308443 | 42.757103 | AL | Madison County | 2.0 | 2.0 | 2.0 | 2.0 | 4.0 | 334811.0 | 334807.0 | 336095.0 | 339565.0 | 342730.0 | 346625.0 | 349796.0 | 352977.0 | 356729.0 | 361762.0 | 367004.0 | 372909.0 | 1288.0 | 3470.0 | 3165.0 | 3895.0 | 3171.0 | 3181.0 | 3752.0 | 5033.0 | 5242.0 | 5905.0 | 967.0 | 4250.0 | 4033.0 | 3999.0 | 4193.0 | 4206.0 | 4191.0 | 4242.0 | 4254.0 | 4242.0 | 600.0 | 2715.0 | 2576.0 | 2892.0 | 2701.0 | 3042.0 | 3110.0 | 3261.0 | 3288.0 | 3305.0 | 367.0 | 1535.0 | 1457.0 | 1107.0 | 1492.0 | 1164.0 | 1081.0 | 981.0 | 966.0 | 937.0 | 100.0 | 513.0 | 648.0 | 457.0 | 402.0 | 542.0 | 610.0 | 362.0 | 450.0 | 230.0 | 790.0 | 1424.0 | 1095.0 | 2324.0 | 1319.0 | 1506.0 | 2067.0 | 3687.0 | 3817.0 | 4721.0 | 890.0 | 1937.0 | 1743.0 | 2781.0 | 1721.0 | 2048.0 | 2677.0 | 4049.0 | 4267.0 | 4951.0 | 31.0 | -2.0 | -35.0 | 7.0 | -42.0 | -31.0 | -6.0 | 3.0 | 9.0 | 17.0 | 8042.0 | 8046.0 | 7570.0 | 7530.0 | 7655.0 | 7778.0 | 8081.0 | 8102.0 | 8519.0 | 8937.0 | 8940.0 | 12.580292 | 11.821866 | 11.602150 | 12.041567 | 11.969726 | 11.810524 | 11.808081 | 11.674529 | 11.466213 | 8.036586 | 7.550986 | 8.390452 | 7.756802 | 8.657134 | 8.764192 | 9.077358 | 9.023473 | 8.933483 | 4.543705 | 4.270880 | 3.211698 | 4.284765 | 3.312592 | 3.046332 | 2.730723 | 2.651057 | 2.532730 | 1.518515 | 1.899472 | 1.325877 | 1.154474 | 1.542461 | 1.719022 | 1.007667 | 1.234964 | 0.621695 | 4.215138 | 3.209755 | 6.742535 | 3.787939 | 4.285879 | 5.824947 | 10.263177 | 10.475242 | 12.760960 | 5.733653 | 5.109227 | 8.068412 | 4.942413 | 5.828340 | 7.543969 | 11.270844 | 11.710206 | 13.382654 | AL | Madison County, AL | 2.0 | 2.0 | 1.0 | 148675.0 | 143415.0 | 5260.0 | 3.5 | 149678.0 | 143919.0 | 5759.0 | 3.8 | 149904.0 | 143021.0 | 6883.0 | 4.6 | 154255.0 | 146915.0 | 7340.0 | 4.8 | 157604.0 | 150260.0 | 7344.0 | 4.7 | 158767.0 | 153075.0 | 5692.0 | 3.6 | 161332.0 | 156018.0 | 5314.0 | 3.3 | 165286.0 | 160218.0 | 5068.0 | 3.1 | 168340.0 | 161326.0 | 7014.0 | 4.2 | 168887.0 | 155310.0 | 13577.0 | 8.0 | 169833.0 | 155656.0 | 14177.0 | 8.3 | 171726.0 | 157512.0 | 14214.0 | 8.3 | 170209.0 | 158399.0 | 11810.0 | 6.9 | 171363.0 | 160551.0 | 10812.0 | 6.3 | 170070.0 | 159573.0 | 10497.0 | 6.2 | 171217.0 | 161839.0 | 9378.0 | 5.5 | 174275.0 | 165388.0 | 8887.0 | 5.1 | 176562.0 | 169711.0 | 6851.0 | 3.9 | 181565.0 | 175352.0 | 6213.0 | 3.4 | 185219.0 | 180505.0 | 4714.0 | 2.5 | 68609.0 | 132.524002 | AL | Madison County | 2.0 | 2.0 | 2.0 | 2.0 | 41789.0 | 37046.0 | 46532.0 | 11.5 | 10.2 | 12.8 | 13373.0 | 11180.0 | 15566.0 | 16.8 | 14.0 | 19.6 | 8528.0 | 6722.0 | 10334.0 | 14.7 | 11.6 | 17.8 | 68609.0 | 64841.0 | 72377.0 | 0.0 |
| 48 | 1097.0 | AL | Mobile County | 2.0 | 2.0 | 2.0 | 2.0 | 92179.0 | 43792.0 | 12675.0 | 12072.0 | 57.4 | 27.2 | 7.9 | 7.5 | 77046.0 | 69871.0 | 29075.0 | 24926.0 | 38.3 | 34.8 | 14.5 | 12.4 | 69340.0 | 75114.0 | 51722.0 | 36078.0 | 29.9 | 32.3 | 22.3 | 15.5 | 58223.0 | 79822.0 | 65452.0 | 46625.0 | 23.3 | 31.9 | 26.2 | 18.6 | 37736.0 | 94715.0 | 81698.0 | 64689.0 | 13.533306 | 33.967751 | 29.299450 | 23.199492 | AL | Mobile County | 2.0 | 2.0 | 2.0 | 2.0 | 0.0 | 412992.0 | 413139.0 | 413315.0 | 413068.0 | 413816.0 | 413918.0 | 414149.0 | 414462.0 | 414945.0 | 414045.0 | 413908.0 | 413210.0 | 176.0 | -247.0 | 748.0 | 102.0 | 231.0 | 313.0 | 483.0 | -900.0 | -137.0 | -698.0 | 1345.0 | 5607.0 | 5677.0 | 5392.0 | 5654.0 | 5693.0 | 5588.0 | 5585.0 | 5543.0 | 5443.0 | 949.0 | 4130.0 | 4051.0 | 4375.0 | 4284.0 | 4148.0 | 4331.0 | 4434.0 | 4429.0 | 4576.0 | 396.0 | 1477.0 | 1626.0 | 1017.0 | 1370.0 | 1545.0 | 1257.0 | 1151.0 | 1114.0 | 867.0 | 122.0 | 597.0 | 688.0 | 588.0 | 457.0 | 567.0 | 605.0 | 385.0 | 440.0 | 266.0 | -314.0 | -2321.0 | -1514.0 | -1467.0 | -1548.0 | -1766.0 | -1363.0 | -2431.0 | -1677.0 | -1821.0 | -192.0 | -1724.0 | -826.0 | -879.0 | -1091.0 | -1199.0 | -758.0 | -2046.0 | -1237.0 | -1555.0 | -28.0 | 0.0 | -52.0 | -36.0 | -48.0 | -33.0 | -16.0 | -5.0 | -14.0 | -10.0 | 6949.0 | 6949.0 | 6977.0 | 6982.0 | 7007.0 | 7064.0 | 7062.0 | 6965.0 | 6922.0 | 7390.0 | 7390.0 | 13.569979 | 13.731067 | 13.028340 | 13.655900 | 13.741068 | 13.474687 | 13.474228 | 13.389649 | 13.161363 | 9.995365 | 9.798230 | 10.571029 | 10.346989 | 10.011936 | 10.443606 | 10.697355 | 10.698675 | 11.064927 | 3.574614 | 3.932837 | 2.457311 | 3.308911 | 3.729132 | 3.031081 | 2.776873 | 2.690974 | 2.096436 | 1.444851 | 1.664079 | 1.420746 | 1.103775 | 1.368555 | 1.458874 | 0.928841 | 1.062862 | 0.643197 | -5.617250 | -3.661940 | -3.544617 | -3.738828 | -4.262555 | -3.286686 | -5.864968 | -4.050955 | -4.403241 | -4.172399 | -1.997862 | -2.123871 | -2.635052 | -2.894000 | -1.827812 | -4.936127 | -2.988092 | -3.760044 | AL | Mobile County, AL | 2.0 | 2.0 | 1.0 | 184951.0 | 175846.0 | 9105.0 | 4.9 | 183762.0 | 173439.0 | 10323.0 | 5.6 | 181590.0 | 169842.0 | 11748.0 | 6.5 | 180242.0 | 167696.0 | 12546.0 | 7.0 | 178915.0 | 167153.0 | 11762.0 | 6.6 | 180384.0 | 171439.0 | 8945.0 | 5.0 | 182019.0 | 174428.0 | 7591.0 | 4.2 | 184490.0 | 177096.0 | 7394.0 | 4.0 | 187266.0 | 176623.0 | 10643.0 | 5.7 | 186825.0 | 165725.0 | 21100.0 | 11.3 | 192730.0 | 170943.0 | 21787.0 | 11.3 | 193851.0 | 172971.0 | 20880.0 | 10.8 | 188626.0 | 171312.0 | 17314.0 | 9.2 | 187046.0 | 171088.0 | 15958.0 | 8.5 | 185318.0 | 170959.0 | 14359.0 | 7.7 | 187264.0 | 174405.0 | 12859.0 | 6.9 | 188542.0 | 176094.0 | 12448.0 | 6.6 | 187560.0 | 177819.0 | 9741.0 | 5.2 | 189004.0 | 180293.0 | 8711.0 | 4.6 | 189107.0 | 182083.0 | 7024.0 | 3.7 | 49492.0 | 95.597923 | AL | Mobile County | 2.0 | 2.0 | 2.0 | 2.0 | 71784.0 | 63651.0 | 79917.0 | 17.7 | 15.7 | 19.7 | 25003.0 | 21090.0 | 28916.0 | 26.4 | 22.3 | 30.5 | 16366.0 | 13238.0 | 19494.0 | 24.1 | 19.5 | 28.7 | 49492.0 | 47745.0 | 51239.0 | 0.0 |
# fontion pour les remplacer par les limites haute ou basse de la plage interquartile
def replace_possible_outliers(df, target_column):
"""
Détecte les outliers dans chaque colonne numérique d'un DataFrame selon la méthode de l'IQR
et les remplace par les limites haute ou basse de la plage interquartile.
Inputs :
--------
- df : DataFrame
Le DataFrame contenant les données, y compris la variable cible.
- target_column : str
Le nom de la colonne représentant la variable cible.
Outputs :
---------
Modifie le DataFrame en place.
"""
for column in df.select_dtypes(include=np.number).columns:
# Ignorer la colonne de la variable cible
if column == target_column:
continue
# Calcul des quartiles et de l'IQR
Q1 = np.nanpercentile(df[column], 25)
Q3 = np.nanpercentile(df[column], 75)
IQR = Q3 - Q1
# Calcul des limites haute et basse
outer_fence = IQR * 1.5
outer_lower = Q1 - outer_fence
outer_upper = Q3 + outer_fence
# Détection des outliers potentiels
mask = (df[column] < outer_lower) | (df[column] > outer_upper)
# Remplacement des outliers par les limites de l'IQR
df.loc[mask, column] = np.where(df.loc[mask, column] > outer_upper, outer_upper, outer_lower)
# Traiter les outliers
replace_possible_outliers(data, "Target")
#création d'un mask pour selectionner les colonne numerique de mon dataset
mask = data.select_dtypes(exclude="object")
# Afficher un boxplot des variables pour detecter visuellement la presence d'un outlier
plt.figure(figsize=(30, 80))
sns.boxplot(data=mask, orient='h')
plt.title(' boxplot des variables')
plt.show()
# voir les variables catégorielles
mask = data.select_dtypes(include="object")
mask
| State_x | Area name | State_y | Area_Name | Stabr_x | area_name | Stabr_y | Area_name | |
|---|---|---|---|---|---|---|---|---|
| 0 | AL | Autauga County | AL | Autauga County | AL | Autauga County, AL | AL | Autauga County |
| 1 | AL | Baldwin County | AL | Baldwin County | AL | Baldwin County, AL | AL | Baldwin County |
| 2 | AL | Barbour County | AL | Barbour County | AL | Barbour County, AL | AL | Barbour County |
| 3 | AL | Bibb County | AL | Bibb County | AL | Bibb County, AL | AL | Bibb County |
| 4 | AL | Blount County | AL | Blount County | AL | Blount County, AL | AL | Blount County |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3107 | WY | Sweetwater County | WY | Sweetwater County | WY | Sweetwater County, WY | WY | Sweetwater County |
| 3108 | WY | Teton County | WY | Teton County | WY | Teton County, WY | WY | Teton County |
| 3109 | WY | Uinta County | WY | Uinta County | WY | Uinta County, WY | WY | Uinta County |
| 3110 | WY | Washakie County | WY | Washakie County | WY | Washakie County, WY | WY | Washakie County |
| 3111 | WY | Weston County | WY | Weston County | WY | Weston County, WY | WY | Weston County |
3112 rows × 8 columns
# supprimer les variables dupliquées
data.drop(["State_y","Area_Name","Stabr_x","area_name","Stabr_y","Area_name"], axis=1, inplace= True)
#fonction qui permet d'encoder toutes les variables catégorielles du DataFrame en faisant du label encoding
def encodage_df(x):
for i in x.select_dtypes(include=["object"]).columns:
x[i],_=pd.factorize(x[i])
return encodage_df(x)
#afficher le nouveau dataset
encodage_df(data)
# Séparer les features et la cible
y = data["Target"]
X = data.drop("Target", axis=1)
# Initialisez le scaler pour standardisez nos variables
sc = StandardScaler()
# Normalisez les données avec la méthode fit_transform
X_normalise = sc.fit_transform(X)
# Contrôlez les dimension de X_normalise
print(f"Les dimensions de nos données centrées réduites : {X_normalise.shape}")
Les dimensions de nos données centrées réduites : (3112, 319)
# Réduction de dimensionnalité avec l'ACP
pca = PCA(0.90)
education_pca = pca.fit_transform(X_normalise)
# Affichage de la variance expliquée par chaque composante
explained_variance_ratio = pca.explained_variance_ratio_
print(f'Variance expliquée par les composantes principales : {explained_variance_ratio}')
Variance expliquée par les composantes principales : [0.49030714 0.10563136 0.06874754 0.05183324 0.02958843 0.02305642 0.02080234 0.01349825 0.01197089 0.01158013 0.01038573 0.00898663 0.00872313 0.00745133 0.00671932 0.00554828 0.00534259 0.00505536 0.00484105 0.00471656 0.00439242 0.0040719 ]
# On affiche le cumul de la variance expliquée
for i in range(len(pca.explained_variance_ratio_)):
if i == 0 :
print(f"Le premier axe explique {sum(pca.explained_variance_ratio_[:i+1])*100:.2f}% de la variance totale")
else :
print(f"Les {i+1} premiers axe expliquent {sum(pca.explained_variance_ratio_[:i+1])*100:.2f}% de la variance totale")
Le premier axe explique 49.03% de la variance totale Les 2 premiers axe expliquent 59.59% de la variance totale Les 3 premiers axe expliquent 66.47% de la variance totale Les 4 premiers axe expliquent 71.65% de la variance totale Les 5 premiers axe expliquent 74.61% de la variance totale Les 6 premiers axe expliquent 76.92% de la variance totale Les 7 premiers axe expliquent 79.00% de la variance totale Les 8 premiers axe expliquent 80.35% de la variance totale Les 9 premiers axe expliquent 81.54% de la variance totale Les 10 premiers axe expliquent 82.70% de la variance totale Les 11 premiers axe expliquent 83.74% de la variance totale Les 12 premiers axe expliquent 84.64% de la variance totale Les 13 premiers axe expliquent 85.51% de la variance totale Les 14 premiers axe expliquent 86.26% de la variance totale Les 15 premiers axe expliquent 86.93% de la variance totale Les 16 premiers axe expliquent 87.48% de la variance totale Les 17 premiers axe expliquent 88.02% de la variance totale Les 18 premiers axe expliquent 88.52% de la variance totale Les 19 premiers axe expliquent 89.01% de la variance totale Les 20 premiers axe expliquent 89.48% de la variance totale Les 21 premiers axe expliquent 89.92% de la variance totale Les 22 premiers axe expliquent 90.33% de la variance totale
# On stocke la contribution des valeurs propres
exp_var_pca = pca.explained_variance_ratio_*100
# On calcule la somme cumulée des valeurs propres
cum_sum_eigenvalues = np.cumsum(exp_var_pca)
# Visualiser la contribution des composantes
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', color="#3b4859", label='Contribution à la variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, c='#ff7373', where='mid',label='Variance expliquée cumulée')
plt.ylabel("Pourcentage d'inertie")
plt.xlabel('Index de nos composantes')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Extraction des charges des variables pour chaque composante
loadings = pca.components_.T
# Affichage des charges des variables
loadings_df = pd.DataFrame(loadings, columns=[f'PC{i}' for i in range(1, pca.n_components_ + 1)], index=X.columns)
print('Charges des variables pour chaque composante principale:')
print(loadings_df)
Charges des variables pour chaque composante principale:
PC1 PC2 PC3 PC4 \
county_fips -0.001663 0.019645 -0.006769 0.005787
State_x -0.001350 0.018677 -0.007443 0.005454
Area name -0.000057 0.018420 0.013312 0.003054
2003 Rural-urban Continuum Code -0.059006 0.001097 0.021406 0.017422
2003 Urban Influence Code -0.052702 0.004409 0.033858 0.025513
... ... ... ... ...
CI90LB517P_2019 -0.015150 -0.128463 0.030369 -0.086595
CI90UB517P_2019 -0.032119 -0.118766 0.024873 -0.087386
MEDHHINC_2019 0.042618 0.110472 -0.014535 0.039022
CI90LBINC_2019 0.047394 0.103026 -0.015598 0.039361
CI90UBINC_2019 0.037768 0.116223 -0.013473 0.038311
PC5 PC6 PC7 PC8 \
county_fips -0.017674 -0.024428 -0.056556 -0.002025
State_x -0.018613 -0.025194 -0.057705 -0.001300
Area name -0.027131 -0.008079 -0.037508 0.007944
2003 Rural-urban Continuum Code -0.135318 -0.055541 0.119110 -0.026777
2003 Urban Influence Code -0.153254 -0.062182 0.147349 -0.021086
... ... ... ... ...
CI90LB517P_2019 -0.080434 -0.031377 -0.033736 0.055094
CI90UB517P_2019 -0.085817 -0.027501 -0.032289 0.053170
MEDHHINC_2019 0.086656 0.057277 0.011159 -0.034036
CI90LBINC_2019 0.082963 0.054386 0.012775 -0.035623
CI90UBINC_2019 0.089130 0.059822 0.009268 -0.032415
PC9 PC10 PC11 PC12 \
county_fips 0.146108 -0.026932 0.109607 -0.017356
State_x 0.145699 -0.028467 0.110187 -0.015763
Area name 0.108664 -0.022922 0.101310 0.009589
2003 Rural-urban Continuum Code 0.003996 -0.040967 -0.120032 0.002506
2003 Urban Influence Code 0.017888 -0.034360 -0.099967 0.021066
... ... ... ... ...
CI90LB517P_2019 -0.088010 -0.030645 0.071310 0.021117
CI90UB517P_2019 -0.084171 -0.038521 0.066208 0.028167
MEDHHINC_2019 0.097297 0.003692 -0.038357 0.063816
CI90LBINC_2019 0.094441 0.004836 -0.037363 0.058825
CI90UBINC_2019 0.098984 0.002003 -0.038877 0.068950
PC13 PC14 PC15 PC16 \
county_fips 0.417378 0.199928 -0.246540 -0.049930
State_x 0.419066 0.198969 -0.243861 -0.048137
Area name 0.331133 0.147777 -0.206022 -0.055225
2003 Rural-urban Continuum Code 0.037559 0.023418 0.004520 0.009103
2003 Urban Influence Code 0.026469 0.024051 0.023811 0.002813
... ... ... ... ...
CI90LB517P_2019 -0.013862 0.001317 0.032315 -0.058745
CI90UB517P_2019 -0.019450 -0.000960 0.029014 -0.051368
MEDHHINC_2019 0.022472 -0.012574 -0.010881 0.048001
CI90LBINC_2019 0.021717 -0.011387 -0.013981 0.044689
CI90UBINC_2019 0.022437 -0.013003 -0.007960 0.050575
PC17 PC18 PC19 PC20 \
county_fips -0.149108 0.021940 0.011615 -0.007992
State_x -0.147870 0.021922 0.012731 -0.007505
Area name -0.114420 0.019022 0.053170 0.009150
2003 Rural-urban Continuum Code 0.010397 0.002493 -0.022559 0.010825
2003 Urban Influence Code 0.014223 -0.009987 -0.007084 0.014637
... ... ... ... ...
CI90LB517P_2019 -0.056729 0.114841 -0.031638 -0.051949
CI90UB517P_2019 -0.067842 0.094914 -0.015314 -0.037751
MEDHHINC_2019 0.083480 -0.103188 0.024709 0.049296
CI90LBINC_2019 0.077791 -0.101046 0.022385 0.045577
CI90UBINC_2019 0.087531 -0.104662 0.026739 0.052315
PC21 PC22
county_fips -0.050358 -0.001895
State_x -0.048519 -0.001421
Area name -0.011853 -0.025245
2003 Rural-urban Continuum Code -0.002212 -0.002922
2003 Urban Influence Code 0.002748 0.012253
... ... ...
CI90LB517P_2019 0.032877 0.009231
CI90UB517P_2019 0.014150 0.000625
MEDHHINC_2019 -0.015326 0.005173
CI90LBINC_2019 -0.020932 0.001968
CI90UBINC_2019 -0.010707 0.007975
[319 rows x 22 columns]
# Identification des variables réduites
variables_reduites = X.columns[np.argmax(np.abs(loadings), axis=0)]
print('Variables réduites :', variables_reduites)
Variables réduites : Index(['Civilian_labor_force_2017', 'Unemployment_rate_2014', 'R_birth_2016',
'Percent of adults with a high school diploma only, 1970',
'2013 Urban Influence Code', 'R_birth_2016',
'Percent of adults completing some college or associate's degree, 2000',
'RESIDUAL_2016', 'R_DOMESTIC_MIG_2012', 'RESIDUAL_2016',
'Percent of adults completing some college (1-3 years), 1970',
'Percent of adults with a high school diploma only, 1990', 'State_x',
'R_DOMESTIC_MIG_2013', 'county_fips', 'R_DOMESTIC_MIG_2012',
'R_DOMESTIC_MIG_2011', 'NET_MIG_2010', 'R_NET_MIG_2016',
'R_NET_MIG_2014', 'NET_MIG_2010', 'R_NET_MIG_2015'],
dtype='object')
# Afficher les features avec les variables réduites
X=X[variables_reduites]
print(f"Le jeu de données a {X.shape[0]} lignes et {X.shape[1]} colonnes")
# Affichez les 5 premières lignes
X.head()
Le jeu de données a 3112 lignes et 22 colonnes
| Civilian_labor_force_2017 | Unemployment_rate_2014 | R_birth_2016 | Percent of adults with a high school diploma only, 1970 | 2013 Urban Influence Code | R_birth_2016 | Percent of adults completing some college or associate's degree, 2000 | RESIDUAL_2016 | R_DOMESTIC_MIG_2012 | RESIDUAL_2016 | Percent of adults completing some college (1-3 years), 1970 | Percent of adults with a high school diploma only, 1990 | State_x | R_DOMESTIC_MIG_2013 | county_fips | R_DOMESTIC_MIG_2012 | R_DOMESTIC_MIG_2011 | NET_MIG_2010 | R_NET_MIG_2016 | R_NET_MIG_2014 | NET_MIG_2010 | R_NET_MIG_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26075.000 | 5.8 | 12.097324 | 31.1 | 2.0 | 12.097324 | 26.9 | -3.0 | -5.971992 | -3.0 | 7.7 | 32.0 | 0 | -4.121042 | 1001.0 | -5.971992 | 5.945455 | 123.5 | 4.777171 | 1.970443 | 123.5 | -1.712875 |
| 1 | 71882.375 | 6.1 | 11.204755 | 26.7 | 2.0 | 11.204755 | 29.3 | 5.5 | 16.219674 | 5.5 | 7.4 | 31.8 | 0 | 18.006050 | 1003.0 | 16.219674 | 15.684259 | 123.5 | 21.279291 | 20.115574 | 123.5 | 17.725964 |
| 2 | 8349.000 | 10.5 | 10.597247 | 19.0 | 6.0 | 10.597247 | 21.3 | 2.0 | -6.457531 | 2.0 | 4.9 | 27.1 | 0 | -7.762540 | 1005.0 | -6.457531 | 0.475598 | -70.0 | -18.238016 | -5.140431 | -70.0 | -15.724575 |
| 3 | 8583.000 | 7.2 | 12.225372 | 19.4 | 1.0 | 12.225372 | 20.4 | 1.0 | -4.624328 | 1.0 | 3.3 | 33.8 | 0 | -6.683190 | 1007.0 | -4.624328 | -5.436808 | -59.0 | -0.708717 | 1.331144 | -59.0 | 1.329817 |
| 4 | 24822.000 | 6.1 | 12.171796 | 22.7 | 1.0 | 12.171796 | 24.8 | -2.0 | -1.737016 | -2.0 | 4.1 | 34.7 | 0 | -1.128482 | 1009.0 | -1.737016 | 0.487228 | 7.0 | -1.391062 | -2.049590 | 7.0 | -1.338525 |
# Calcul de la matrice de corrélation entre les variables
correlation_matrix = X.corr()
# Graphique de matrice de corrélation
plt.figure(figsize=(15,15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Matrice de Corrélation entre les Variables')
plt.show()
# Vérifier les variables dupliquées
duplicated_variables = X.columns[X.columns.duplicated()]
if len(duplicated_variables) > 0:
print("Variables dupliquées trouvées:")
print(duplicated_variables)
else:
print("Aucune variable dupliquée trouvée.")
Variables dupliquées trouvées: Index(['R_birth_2016', 'RESIDUAL_2016', 'R_DOMESTIC_MIG_2012', 'NET_MIG_2010'], dtype='object')
# supprimer les variables dupliquées
X.drop(duplicated_variables, axis=1, inplace=True)
# Calcul de la matrice de corrélation entre les variables
correlation_matrix = X.corr()
# Afficher les paires de variables fortement corrélées
threshold = 0.8 # Choisissez le seuil de corrélation
correlated_pairs = []
for i in range(len(correlation_matrix.columns)):
for j in range(i+1, len(correlation_matrix.columns)):
if abs(correlation_matrix.iloc[i, j]) > threshold:
pair = (correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j])
correlated_pairs.append(pair)
if len(correlated_pairs) > 0:
print("Paires de variables fortement corrélées:")
for pair in correlated_pairs:
print(f"{pair[0]} et {pair[1]} : {pair[2]}")
else:
print("Aucune paire de variables fortement corrélées trouvée avec un seuil de corrélation de", threshold)
Paires de variables fortement corrélées: State_x et county_fips : 0.9992542465269411
# supprimer la colonne "State_x"
X.drop(["State_x"],axis=1, inplace=True)
print("variable supprimer avec succes")
variable supprimer avec succes
# split des données en train et test
set_seed = 1204
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=set_seed)
# Définir et instancier le classifieur à entraîner (ici, une régression logistique)
estimator = LogisticRegression(random_state=set_seed)
# Instanciation de la RFECV()
rfecv_selector = RFECV(estimator=estimator,
min_features_to_select=10,
scoring='f1',
n_jobs=-1,
step=1,
cv=5
)
# Entraînez le modèle
rfecv_selector.fit(X_train, y_train)
RFECV(cv=5, estimator=LogisticRegression(random_state=1204),
min_features_to_select=10, n_jobs=-1, scoring='f1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RFECV(cv=5, estimator=LogisticRegression(random_state=1204),
min_features_to_select=10, n_jobs=-1, scoring='f1')LogisticRegression(random_state=1204)
LogisticRegression(random_state=1204)
# Représentation graphique AUC en fonction du nombre de variables, après RFECV.
mean_test_roc_auc = rfecv_selector.cv_results_["mean_test_score"]
n_features_selected = rfecv_selector.n_features_
min_features_to_select = rfecv_selector.min_features_to_select
n_features_in_rfecv = rfecv_selector.n_features_in_
fig = px.line(y=mean_test_roc_auc,
x=range(min_features_to_select, min_features_to_select + len(mean_test_roc_auc)),
labels={
"x": "Nombre de variables sélectionnées",
"y": "Score moyen de cross-validation"
},
title=f"Résultats RFECV avec {rfecv_selector.cv} k-fold"
)
fig.add_vline(x=n_features_selected, line_width=3, line_dash="dash", line_color="#3b4859")
fig.add_vrect(x0=30, x1=n_features_in_rfecv, line_width=0, fillcolor="#ff7373", opacity=0.2)
fig.show()
# Récupérez l'ensemble des variables issues de RFECV et transformer là en liste
rfe_features = rfecv_selector.get_feature_names_out().tolist()
print(f"Variables sélectionnées avec une performance élevée: :\n{rfe_features}\n")
Variables sélectionnées avec une performance élevée: : ['Civilian_labor_force_2017', 'Unemployment_rate_2014', 'Percent of adults with a high school diploma only, 1970', '2013 Urban Influence Code', "Percent of adults completing some college or associate's degree, 2000", 'Percent of adults completing some college (1-3 years), 1970', 'Percent of adults with a high school diploma only, 1990', 'R_DOMESTIC_MIG_2013', 'R_DOMESTIC_MIG_2011', 'R_NET_MIG_2016', 'R_NET_MIG_2014', 'R_NET_MIG_2015']
# Afficher les Variables pré-sélecttionnées
X=X[rfe_features]
print(f"Le jeu de données a {X.shape[0]} lignes et {X.shape[1]} colonnes")
# Affichez les 5 premières lignes
X.head()
Le jeu de données a 3112 lignes et 12 colonnes
| Civilian_labor_force_2017 | Unemployment_rate_2014 | Percent of adults with a high school diploma only, 1970 | 2013 Urban Influence Code | Percent of adults completing some college or associate's degree, 2000 | Percent of adults completing some college (1-3 years), 1970 | Percent of adults with a high school diploma only, 1990 | R_DOMESTIC_MIG_2013 | R_DOMESTIC_MIG_2011 | R_NET_MIG_2016 | R_NET_MIG_2014 | R_NET_MIG_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26075.000 | 5.8 | 31.1 | 2.0 | 26.9 | 7.7 | 32.0 | -4.121042 | 5.945455 | 4.777171 | 1.970443 | -1.712875 |
| 1 | 71882.375 | 6.1 | 26.7 | 2.0 | 29.3 | 7.4 | 31.8 | 18.006050 | 15.684259 | 21.279291 | 20.115574 | 17.725964 |
| 2 | 8349.000 | 10.5 | 19.0 | 6.0 | 21.3 | 4.9 | 27.1 | -7.762540 | 0.475598 | -18.238016 | -5.140431 | -15.724575 |
| 3 | 8583.000 | 7.2 | 19.4 | 1.0 | 20.4 | 3.3 | 33.8 | -6.683190 | -5.436808 | -0.708717 | 1.331144 | 1.329817 |
| 4 | 24822.000 | 6.1 | 22.7 | 1.0 | 24.8 | 4.1 | 34.7 | -1.128482 | 0.487228 | -1.391062 | -2.049590 | -1.338525 |
# voir la description statistique pour le sous ensemble de donnees
X.describe()
| Civilian_labor_force_2017 | Unemployment_rate_2014 | Percent of adults with a high school diploma only, 1970 | 2013 Urban Influence Code | Percent of adults completing some college or associate's degree, 2000 | Percent of adults completing some college (1-3 years), 1970 | Percent of adults with a high school diploma only, 1990 | R_DOMESTIC_MIG_2013 | R_DOMESTIC_MIG_2011 | R_NET_MIG_2016 | R_NET_MIG_2014 | R_NET_MIG_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 | 3112.000000 |
| mean | 22923.228591 | 6.179242 | 28.409030 | 5.224293 | 26.127988 | 8.903583 | 34.300739 | -1.733496 | -1.850067 | -0.101350 | -0.904376 | -0.561907 |
| std | 24293.472284 | 2.099572 | 8.122363 | 3.472972 | 5.627673 | 3.552670 | 6.118798 | 8.630109 | 7.732892 | 10.504784 | 8.949476 | 9.550746 |
| min | 100.000000 | 1.200000 | 5.800000 | 1.000000 | 10.100000 | 0.000000 | 18.000000 | -21.516192 | -19.562411 | -24.510364 | -22.024474 | -22.544480 |
| 25% | 5003.000000 | 4.600000 | 21.900000 | 2.000000 | 22.100000 | 6.100000 | 30.300000 | -6.695351 | -6.246028 | -6.231558 | -6.221956 | -6.005191 |
| 50% | 11653.500000 | 6.000000 | 29.400000 | 5.000000 | 26.200000 | 8.600000 | 33.800000 | -1.954317 | -1.842191 | -0.673901 | -1.153278 | -1.007086 |
| 75% | 31754.750000 | 7.500000 | 35.000000 | 8.000000 | 30.100000 | 11.200000 | 38.500000 | 3.185209 | 2.631560 | 5.954312 | 4.313056 | 5.021001 |
| max | 71882.375000 | 11.850000 | 47.900000 | 12.000000 | 42.100000 | 18.850000 | 50.800000 | 18.006050 | 15.947943 | 24.233118 | 20.115574 | 21.560290 |
#afficher la representation graphique de toutes les variables quantitative
quantitative_columns=X.select_dtypes(exclude=object).columns
for var in quantitative_columns:
X[var].hist()
plt.title('Histogramme de la variable ' + var )
plt.xlabel(var)
plt.ylabel('Frequency')
plt.show()
# Afficher les graphiques deux à deux
sns.pairplot(X)
plt.show()
# Affichez les dimensions du jeu de données historique_presidentielle
print(f"Le jeu de données a {historique_presidentielle.shape[0]} lignes et {historique_presidentielle.shape[1]} colonnes")
# Affichez les 5 premières lignes
historique_presidentielle.head()
Le jeu de données a 3112 lignes et 14 colonnes
| fips_code | county | total_2008 | dem_2008 | gop_2008 | oth_2008 | total_2012 | dem_2012 | gop_2012 | oth_2012 | total_2016 | dem_2016 | gop_2016 | oth_2016 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26041 | Delta County | 19064 | 9974 | 8763 | 327 | 18043 | 8330 | 9533 | 180 | 18467 | 6431 | 11112 | 924 |
| 1 | 48295 | Lipscomb County | 1256 | 155 | 1093 | 8 | 1168 | 119 | 1044 | 5 | 1322 | 135 | 1159 | 28 |
| 2 | 1127 | Walker County | 28652 | 7420 | 20722 | 510 | 28497 | 6551 | 21633 | 313 | 29243 | 4486 | 24208 | 549 |
| 3 | 48389 | Reeves County | 3077 | 1606 | 1445 | 26 | 2867 | 1649 | 1185 | 33 | 3184 | 1659 | 1417 | 108 |
| 4 | 56017 | Hot Springs County | 2546 | 619 | 1834 | 93 | 2495 | 523 | 1894 | 78 | 2535 | 400 | 1939 | 196 |
#afficher les info du dataset
historique_presidentielle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3112 entries, 0 to 3111 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fips_code 3112 non-null int64 1 county 3112 non-null object 2 total_2008 3112 non-null int64 3 dem_2008 3112 non-null int64 4 gop_2008 3112 non-null int64 5 oth_2008 3112 non-null int64 6 total_2012 3112 non-null int64 7 dem_2012 3112 non-null int64 8 gop_2012 3112 non-null int64 9 oth_2012 3112 non-null int64 10 total_2016 3112 non-null int64 11 dem_2016 3112 non-null int64 12 gop_2016 3112 non-null int64 13 oth_2016 3112 non-null int64 dtypes: int64(13), object(1) memory usage: 340.5+ KB
# fonction permettant d'avoir d'avoir les resultas pour chaque année de 2008 à 2016 dans une variable
def add_column(data, dem_col, gop_col, new_col_name):
"""
Ajoute une nouvelle colonne à un DataFrame donné en utilisant une fonction de recodage
basée sur les colonnes spécifiées contenant les votes démocrates et républicains.
Args:
data (pandas.DataFrame): Le DataFrame auquel ajouter la nouvelle colonne.
dem_col (str): Le nom de la colonne contenant les votes démocrates.
gop_col (str): Le nom de la colonne contenant les votes républicains.
new_col_name (str, optional): Le nom de la nouvelle colonne à créer.
Returns:
pandas.DataFrame: Le DataFrame modifié avec la nouvelle colonne ajoutée.
"""
data[new_col_name] = data.apply(lambda row: recodage(row[dem_col], row[gop_col]), axis=1)
return data
# Utilisation de la fonction avec les noms de colonnes spécifiés
historique_presidentielle = add_column(historique_presidentielle,"dem_2008", "gop_2008", "resulat_2008")
historique_presidentielle = add_column(historique_presidentielle,"dem_2012", "gop_2012", "resulat_2012" )
historique_presidentielle = add_column(historique_presidentielle,"dem_2016", "gop_2016", "resulat_2016" )
# selection des variables créés
historique_presidentielle=historique_presidentielle[["resulat_2008","resulat_2012","resulat_2016"]]
historique_presidentielle.head()
| resulat_2008 | resulat_2012 | resulat_2016 | |
|---|---|---|---|
| 0 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 |
| 3 | 1 | 1 | 1 |
| 4 | 0 | 0 | 0 |
def plot_countplots(data):
"""
Affiche les countplots de toutes les variables d'un DataFrame donné avec des grilles.
Args:
data (pandas.DataFrame): Le DataFrame contenant les variables à visualiser.
"""
# Définir le style de grille
sns.set_style("whitegrid")
# Récupérer les noms de toutes les colonnes du DataFrame
columns = data.columns
# Calculer le nombre de lignes et de colonnes pour la disposition des subplots
n_cols = 3 # Nombre de subplots par ligne
n_rows = (len(columns) - 1) // n_cols + 1 # Calcul du nombre de lignes nécessaires
# Créer la disposition des subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
axes = axes.flatten()
# Parcourir chaque colonne du DataFrame et tracer le countplot correspondant
for i, col in enumerate(columns):
ax = sns.countplot(data=data, x=col, ax=axes[i])
ax.set_title("Vote populaire aux élections")
# Définir les nouvelles étiquettes de l'axe des abscisses
new_labels = ['Démocrate', 'Républicain']
ax.set_xticklabels(new_labels)
# Afficher les subplots
plt.tight_layout()
plt.show()
# Utilisation de la fonction pour afficher les countplots de toutes les variables
plot_countplots(historique_presidentielle)
# Afficher le nombre de valeurs des catégories de la variable cible
sns.countplot(data, x="Target")
plt.title("Vote populaire aux élections de 2020")
# Maintenant, modifions les étiquettes de l'axe des abscisses
n_categories = len(data['Target'].unique())
# Définir les nouvelles étiquettes de l'axe des abscisses
new_labels = ['Démocrate', 'Républicain']
# Modifier les étiquettes de l'axe des abscisses
plt.xticks(range(n_categories), new_labels)
# Afficher à nouveau le graphique avec les nouvelles étiquettes
plt.show()
Le parti Démocrate a remporté le vote populaire aux élections présidentielles américaines de 2008, 2012 et 2020. En 2016, Hillary Clinton, la candidate Démocrate, a remporté le vote populaire mais a perdu l'élection face à Donald Trump.
#Affichez les features
X.head()
| Civilian_labor_force_2017 | Unemployment_rate_2014 | Percent of adults with a high school diploma only, 1970 | 2013 Urban Influence Code | Percent of adults completing some college or associate's degree, 2000 | Percent of adults completing some college (1-3 years), 1970 | Percent of adults with a high school diploma only, 1990 | R_DOMESTIC_MIG_2013 | R_DOMESTIC_MIG_2011 | R_NET_MIG_2016 | R_NET_MIG_2014 | R_NET_MIG_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26075.000 | 5.8 | 31.1 | 2.0 | 26.9 | 7.7 | 32.0 | -4.121042 | 5.945455 | 4.777171 | 1.970443 | -1.712875 |
| 1 | 71882.375 | 6.1 | 26.7 | 2.0 | 29.3 | 7.4 | 31.8 | 18.006050 | 15.684259 | 21.279291 | 20.115574 | 17.725964 |
| 2 | 8349.000 | 10.5 | 19.0 | 6.0 | 21.3 | 4.9 | 27.1 | -7.762540 | 0.475598 | -18.238016 | -5.140431 | -15.724575 |
| 3 | 8583.000 | 7.2 | 19.4 | 1.0 | 20.4 | 3.3 | 33.8 | -6.683190 | -5.436808 | -0.708717 | 1.331144 | 1.329817 |
| 4 | 24822.000 | 6.1 | 22.7 | 1.0 | 24.8 | 4.1 | 34.7 | -1.128482 | 0.487228 | -1.391062 | -2.049590 | -1.338525 |
# affichez la target
y.value_counts(normalize=True)
Target 0.0 0.827121 1.0 0.172879 Name: proportion, dtype: float64
La variable cible est déséquilibrée. il faut gérer le désequilibre, dans notre cas on va surechantilloner notre classe minoritaire
# Divisez vos données en ensembles d'entraînement et de test
set_seed=1204
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=set_seed)
# Affichez les tailles des ensembles résultants
print("Taille de l'ensemble d'entraînement :", len(X_train))
print("Taille de l'ensemble de test :", len(X_test))
Taille de l'ensemble d'entraînement : 2489 Taille de l'ensemble de test : 623
# Fonction de suréchantillonnage de la classe minoritaire de mon dataset
def oversample_minority_class(X_train, y_train):
# Concaténer X_train et y_train pour faciliter le suréchantillonnage
train_data = pd.concat([X_train, y_train], axis=1)
# Trouver la classe minoritaire
class_counts = y_train.value_counts()
minority_class_label = class_counts.idxmin()
# Séparer les classes majoritaire et minoritaire
majority_class = train_data[train_data[y_train.name] != minority_class_label]
minority_class = train_data[train_data[y_train.name] == minority_class_label]
# Déterminer le nombre d'échantillons de la classe majoritaire et minoritaire
num_majority_samples = len(majority_class)
num_minority_samples = len(minority_class)
# Suréchantillonnage aléatoire de la classe minoritaire
oversampled_minority = minority_class.sample(n=num_majority_samples, replace=True, random_state=set_seed)
# Concaténer les échantillons suréchantillonnés avec les échantillons de la classe majoritaire
oversampled_data = pd.concat([majority_class, oversampled_minority], axis=0)
# Mélanger les données
oversampled_data = oversampled_data.sample(frac=1, random_state=set_seed).reset_index(drop=True)
# Séparer les features et les labels
X_train_oversampled = oversampled_data.drop(columns=[y_train.name])
y_train_oversampled = oversampled_data[y_train.name]
return X_train_oversampled, y_train_oversampled
# Affichage des premières lignes du DataFrame avant suréchantillonnage
print("Avant suréchantillonnage:")
print(y_train.value_counts())
# Appliquer la fonction de suréchantillonnage
X_oversampled, y_oversampled = oversample_minority_class(X_train, y_train)
# Afficher les premières lignes du DataFrame suréchantillonné
print("\nAprès suréchantillonnage:")
print(y_oversampled.value_counts())
Avant suréchantillonnage: Target 0.0 2056 1.0 433 Name: count, dtype: int64 Après suréchantillonnage: Target 0.0 2056 1.0 2056 Name: count, dtype: int64
# Maintenant, divisez ensembles d'entraînement et de validation
X_train, X_val, y_train, y_val = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=set_seed)
# Affichez les tailles des ensembles résultants
print("Taille de l'ensemble d'entraînement :", len(X_train))
print("Taille de l'ensemble de validation :", len(X_val))
print("Taille de l'ensemble de test :", len(X_test))
Taille de l'ensemble d'entraînement : 3289 Taille de l'ensemble de validation : 823 Taille de l'ensemble de test : 623
# Définition des modèles
models = {
"Logistic Regression": LogisticRegression(random_state=set_seed),
"Linear SVM": SVC(kernel='linear', random_state=set_seed),
"SVM with polynomial kernel": SVC(kernel='poly', random_state=set_seed),
"Decision Tree": DecisionTreeClassifier(random_state=set_seed),
"Random Forest": RandomForestClassifier(random_state=set_seed),
"Gradient Boosting": GradientBoostingClassifier(random_state=set_seed)
}
# Initialisation de la pipeline pour chaque modèle
pipelines = {}
for name, model in models.items():
pipelines[name] = Pipeline([
('scaler', StandardScaler()), # Standardisation des features
('model', model) # Modèle spécifique
])
# Entraînement et évaluation de chaque modèle
for name, pipeline in pipelines.items():
print("Modèle :", name)
# Entraînement du modèle
pipeline.fit(X_train, y_train)
# Prédiction sur l'ensemble de test
y_pred = pipeline.predict(X_test)
# Évaluation du modèle
f1 = f1_score(y_test, y_pred, average="micro")
print("Score F1 sur l'ensemble de test:", f1)
print("\n")
Modèle : Logistic Regression Score F1 sur l'ensemble de test: 0.7897271268057785 Modèle : Linear SVM Score F1 sur l'ensemble de test: 0.8073836276083467 Modèle : SVM with polynomial kernel Score F1 sur l'ensemble de test: 0.8459069020866774 Modèle : Decision Tree Score F1 sur l'ensemble de test: 0.8426966292134831 Modèle : Random Forest Score F1 sur l'ensemble de test: 0.8892455858747994 Modèle : Gradient Boosting Score F1 sur l'ensemble de test: 0.8571428571428571
Les modèles les plus performants appartenant à différentes "familles" sont la régression logistique, le SVM avec noyau polynomial, la forêt aléatoire et le gradient boosting.
# Définition des grilles de recherche d'hyperparamètres pour chaque modèle
param_grids = {
"Logistic Regression": {
'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
'classifier__penalty': ['l1', 'l2'],
},
"SVM with polynomial kernel": {
'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
'classifier__degree': [2, 3, 4],
},
"Random Forest": {
'classifier__n_estimators': [100, 200, 300],
'classifier__max_depth': [None, 10, 20, 30],
},
"Gradient Boosting": {
"classifier__n_estimators": [50, 100, 200, 500],
'classifier__learning_rate': [0.01, 0.1, 1, 10],
}
}
# Initialisation des modèles avec les pipelines
models_to_tune = {
"Logistic Regression": Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression(max_iter=1000, random_state=set_seed))
]),
"SVM with polynomial kernel": Pipeline([
('scaler', StandardScaler()),
('classifier', SVC(kernel="poly", random_state=set_seed))
]),
"Random Forest": Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier(random_state=set_seed))
]),
"Gradient Boosting": Pipeline([
('scaler', StandardScaler()),
('classifier', GradientBoostingClassifier(random_state=set_seed))
])
}
# Recherche des meilleurs hyperparamètres pour chaque modèle
for model_name, model in models_to_tune.items():
print(f"Tuning hyperparameters for {model_name}...")
param_grid = param_grids[model_name]
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=1, scoring='f1_micro')
grid_search.fit(X_val, y_val)
print("Best parameters found:")
print(grid_search.best_params_)
print("Best score found:")
print(grid_search.best_score_)
print('\n')
Tuning hyperparameters for Logistic Regression...
Best parameters found:
{'classifier__C': 0.1, 'classifier__penalty': 'l2'}
Best score found:
0.80189948263119
Tuning hyperparameters for SVM with polynomial kernel...
Best parameters found:
{'classifier__C': 1, 'classifier__degree': 3}
Best score found:
0.8310790835181081
Tuning hyperparameters for Random Forest...
Best parameters found:
{'classifier__max_depth': None, 'classifier__n_estimators': 300}
Best score found:
0.8809386548410938
Tuning hyperparameters for Gradient Boosting...
Best parameters found:
{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 500}
Best score found:
0.8736363636363637
Le meilleur modèle sur les données de validation est le Random Forest avec 0.8809386548410938
# Créer une pipeline pour utiliser les meilleurs hyperparamètres trouvés le Random Forest
pipeline_rf = Pipeline([
('scaler', StandardScaler()), # Étape de standardisation
('random_forest', RandomForestClassifier(max_depth=None, n_estimators=300, random_state=set_seed))
])
# Fusionnez l'ensemble d'entraînement initial et l'ensemble de validation pour les features
X_train_full = pd.concat([X_train, X_val], axis=0, ignore_index=True)
# Fusionnez l'ensemble d'entraînement initial et l'ensemble de validation pour les labels
y_train_full = pd.concat([y_train, y_val], axis=0, ignore_index=True)
# Entraîner le modèle sur l'ensemble d'entraînement complet
pipeline_rf.fit(X_train_full, y_train_full)
# Prédire les étiquettes sur l'ensemble de test
y_pred_rf = pipeline_rf.predict(X_test)
# Évaluer les performances du modèle sur l'ensemble de test avec le score F1
f1_rf = f1_score(y_test, y_pred_rf, average="micro")
print("Score F1 sur l'ensemble de test avec Random Forest:", f1_rf)
# Afficher le rapport de classification
print("Rapport de classification :")
print(classification_report(y_test, y_pred_rf))
Score F1 sur l'ensemble de test avec Random Forest: 0.8892455858747994
Rapport de classification :
precision recall f1-score support
0.0 0.91 0.96 0.94 518
1.0 0.74 0.53 0.62 105
accuracy 0.89 623
macro avg 0.82 0.75 0.78 623
weighted avg 0.88 0.89 0.88 623
# Calcul des probabilités prédites pour la classe positive (classe 1)
y_score_rf = pipeline_rf.predict_proba(X_test)[:, 1]
# Calcul des taux de faux positifs (FPR) et des taux de vrais positifs (TPR) pour la courbe ROC
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_score_rf)
# Calcul de l'aire sous la courbe ROC (AUC)
roc_auc_rf = auc(fpr_rf, tpr_rf)
# Tracé de la courbe ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label='Courbe ROC (AUC = %0.2f)' % roc_auc_rf)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbe ROC pour Random Forest')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
from sklearn.metrics import confusion_matrix
# Obtenir la matrice de confusion
conf_matrix = confusion_matrix(y_test, y_pred)
# Créer un heatmap avec Seaborn
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
# Ajouter des étiquettes et un titre
plt.xlabel('Valeurs prédites')
plt.ylabel('Valeurs réelles')
plt.title('Matrice de confusion')
# Afficher le plot
plt.show()
# Calculer l'importance de permutation des variables pour le meilleur modèle
result = permutation_importance(pipeline, X_test, y_test, n_repeats=30, random_state=set_seed)
# Créer un DataFrame avec les variables et leurs importances de permutation
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': result.importances_mean})
# Trier le DataFrame par importance de permutation
importance_df = importance_df.sort_values(by='Importance', ascending=False)
# Réinitialiser les index du DataFrame
importance_df = importance_df.reset_index(drop=True)
# Afficher le DataFrame avec les variables et leurs importances de permutation
print(importance_df)
Feature Importance 0 Civilian_labor_force_2017 0.054521 1 Percent of adults with a high school diploma o... 0.051953 2 Percent of adults completing some college (1-3... 0.029909 3 Percent of adults completing some college or a... 0.019048 4 2013 Urban Influence Code 0.010219 5 Unemployment_rate_2014 0.009470 6 Percent of adults with a high school diploma o... 0.006688 7 R_NET_MIG_2015 0.006421 8 R_DOMESTIC_MIG_2013 0.004869 9 R_NET_MIG_2014 0.004762 10 R_DOMESTIC_MIG_2011 0.001445 11 R_NET_MIG_2016 0.000589
# Afficher les résultats
sorted_idx = result.importances_mean.argsort()
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), result.importances_mean[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), X.columns[sorted_idx])
plt.xlabel('Importance de permutation')
plt.title('Importance de permutation des variables')
plt.show()
Il s'agit des caractéristiques qui ont montré une contribution positive significative à la capacité prédictive du modèle logistique.
# Caractéristiques d'un exemple spécifique
i = 6
X_test.values[i]
array([3756. , 9.2 , 18.4 , 12. ,
24.2 , 8. , 31.7 , 18.00604954,
13.35543452, 24.23311774, 20.11557361, 18.11984212])
import shap
# Initialisation du Javascript
shap.initjs()
# Création de l'explainer SHAP
explainer_shap = shap.TreeExplainer(pipeline_rf.named_steps['random_forest'])
# Obtenir la valeur prédite du modèle
predicted_value = pipeline_rf.predict(X_test.values[i].reshape(1, -1))
# Afficher la valeur prédite du modèle
print("Valeur prédite du modèle:", predicted_value[0])
# Obtenir les valeurs SHAP pour un exemple spécifique
shap_values = explainer_shap.shap_values(X_test.values[i])
# Afficher les valeurs SHAP
shap.force_plot(explainer_shap.expected_value[1], shap_values[1], feature_names=X_train_full.columns)
Valeur prédite du modèle: 0.0
le modèle prédit le parti politique Démocrate vainqueur pour les caractéristique de l'index 6. On peut visualiser sur le graphique, sous la barre bleu les features qui ont influencés sa prédiction.